This is the first part of analysis including for the paper “Rates and patterns of mutation in tandem repetitive DNA in six independent lineages of Chamydomonas reinhardtii”. This file goes through the genome-wide description of tandem repeats in 6 different strains of Chlamydomonas reinhardtii.
This will read in the file “reads.summary.txt” containing the columns:
a. File = MA line name (fastq file name)
b. Repreads = Number of reads with repeats as identified by k-seek
c. Totreads = Total number of reads sequenced for that line
d. Proportion = Repreads/totreads (percent repetitive)
setwd("/Users/ses425/Dropbox/Cornell/1.Projects/1.Chlamy-USE_BOX_FOLDER_INSTEAD/Manuscript/GitHub/Chlamy-repeats-mutation-master")
reads.summary <- read.delim("./Input_FileS1/reads.summary.txt", header=FALSE)
colnames(reads.summary) <- c("File", "Repreads", "Totreads", "Proportion")
We assessed a total of 2.564885210^{9} reads.
The total number of repetitive reads (uncorrected) is 4.813310^{6}.
The mean proportion of reads that are repetitive per sample (not corrected) is 0.18.
hist(reads.summary$Proportion)
If yes, this confirms that we need to correct for sequencing effort across libraries.
plot(x=reads.summary$Totreads, y=reads.summary$Repreads, xlab="Total reads", ylab="Reads with tandem repeats")
summary(lm(formula = reads.summary$Repreads ~ reads.summary$Totreads))
##
## Call:
## lm(formula = reads.summary$Repreads ~ reads.summary$Totreads)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31284 -5975 126 7195 33872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.456e+04 3.063e+03 -4.753 8.33e-06 ***
## reads.summary$Totreads 2.359e-03 9.507e-05 24.813 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9899 on 83 degrees of freedom
## Multiple R-squared: 0.8812, Adjusted R-squared: 0.8798
## F-statistic: 615.7 on 1 and 83 DF, p-value: < 2.2e-16
# p < 2.2e-16, R^2 = 0.88
Yes. There is a significant relationship that needs to be corrected.
This dataset is the counts of all kmers found by k-Seek in the 85 MA lines analyzed. Before running k-Seek, the data have been trimmed using trimmomatic to remove low quality sequences. The kmer counts have been normalized by kmer sequence GC content and sequencing library depth. Here we add sample names to each library and sort the kmers by mean abundance. We will be working with the data_sorted file (kmers sorted by their mean abundance).
#read in data, give appropriate rownames
data <- read.csv('./Input_FileS1/Chlamy_trimmed_GCcorr.compiled')
file_names <- as.vector(data$X)
rownames(data) <- file_names
data[,1] <- NULL
# match the file names to the names of the lines
Chlamy_file_info <- read.delim("./Input_FileS1/Chlamy_file_info.txt", header=FALSE)
Chlamy_lines <- as.vector(Chlamy_file_info[,2])
Chlamy_files <- as.vector(Chlamy_file_info[,1])
names(Chlamy_lines) <- Chlamy_files
Chlamy_lines <- as.data.frame(Chlamy_lines)
# format the data and name the rows by the MA line ID
data_withnames <- merge(data, Chlamy_lines, by=0)
lines <- data_withnames[,3314]
data_withnames <- data_withnames[,-c(1,3314)]
rownames(data_withnames) <- lines
data_withnames <- as.matrix(data_withnames)
# sort the columns (kmers) by their mean abundance
kmer_means <- colMeans(data_withnames)
data_sorted <- data_withnames[,order(kmer_means, decreasing=T)]
Some MA lines had their kmer counts corrected strongly down for GC contents ~50%. Check to see if it appears like these MA lines appear to have been over-corrected in the counts of kmers 50%. Look at the AC repeat in particular. From below, correction looks good.
# Some MA lines were strongly corrected near 50% GC - check if this seems reasonable by the corrected copy number
data_sorted["CC-2937_MA1", 1:5]
## AC AGC AAAACCCT C AGGCGG
## 44678 6549 1980 6801 313
data_sorted["CC-2937_MA2", 1:5]
## AC AGC AAAACCCT C AGGCGG
## 43609 7169 2118 3165 239
For each ancestral lineage, get the absolute amount of kmer content in kb. Use all the kmers that have at least 2 copies (normalized) in all the MA lines
data_1373 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-1373"), ]
data_1952 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-1952"), ]
data_2342 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2342"), ]
data_2344 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2344"), ]
data_2931 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2931"), ]
data_2937 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2937"), ]
#FUNCTIONS
get_legit_kmer_indexes <- function (kmer_mx) {
indexes <- c()
for (i in 1:ncol(kmer_mx)) { ## go through each column of the data
p <- kmer_mx[,i] >= 2 ## at least 2 copies
if (sum(p)==nrow(kmer_mx)) { # if all samples have at least 2 copies
indexes <- c(indexes, i)
}
}
return(indexes)
}
#get matrix with absolute kmer content (multiplied by length of kmer)
get_abskmer_mx <- function (kmer_mx, indexes) {
kmers.abs <- matrix(nrow=nrow(kmer_mx), ncol=max(indexes))
for (i in indexes) {
for (j in 1:nrow(kmer_mx)) {
kmers.abs[j,i] <- kmer_mx[j,i] * (nchar(colnames(kmer_mx)[i], type="chars"))
}
}
kmers.abs <- kmers.abs[,colSums(is.na(kmers.abs)) != nrow(kmers.abs)] # get rid of columns that are all NAs
return(kmers.abs)
}
# get total kmer content per MA line
get_total_kmercontent <- function (abs_mx) {
total.kmers.abs <- c()
for (i in 1:nrow(abs_mx)){
total.kmers.abs[i] <- sum(abs_mx[i,])/10^3
}
names(total.kmers.abs) <- rownames(abs_mx)
return (total.kmers.abs)
}
legit.kmer.indexes_1373 <- get_legit_kmer_indexes(data_1373)
N_k2_1373<-length(legit.kmer.indexes_1373)
abs.mx_1373 <- get_abskmer_mx(data_1373, legit.kmer.indexes_1373)
total.kmercontent_1373 <- get_total_kmercontent(abs.mx_1373)
Mean_k2_cont_1373<-mean(total.kmercontent_1373)
Mean_k2_cont_per_line_1373<-total.kmercontent_1373 # note: this is in kb
legit.kmer.indexes_1952 <- get_legit_kmer_indexes(data_1952)
N_k2_1952<-length(legit.kmer.indexes_1952)
abs.mx_1952 <- get_abskmer_mx(data_1952, legit.kmer.indexes_1952)
total.kmercontent_1952 <- get_total_kmercontent(abs.mx_1952)
Mean_k2_cont_1952<-mean(total.kmercontent_1952) # note: this is in kb
Mean_k2_cont_per_line_1952<-total.kmercontent_1952 # note: this is in kb
legit.kmer.indexes_2342 <- get_legit_kmer_indexes(data_2342)
N_k2_2342<-length(legit.kmer.indexes_2342)
abs.mx_2342 <- get_abskmer_mx(data_2342, legit.kmer.indexes_2342)
total.kmercontent_2342 <- get_total_kmercontent(abs.mx_2342)
Mean_k2_cont_2342<-mean(total.kmercontent_2342) # note: this is in kb
Mean_k2_cont_per_line_2342<-total.kmercontent_2342 # note: this is in kb
legit.kmer.indexes_2344 <- get_legit_kmer_indexes(data_2344)
N_k2_2344<-length(legit.kmer.indexes_2344)
abs.mx_2344 <- get_abskmer_mx(data_2344, legit.kmer.indexes_2344)
total.kmercontent_2344 <- get_total_kmercontent(abs.mx_2344)
Mean_k2_cont_2344<-mean(total.kmercontent_2344)
Mean_k2_cont_per_line_2344<-total.kmercontent_2344 # note: this is in kb
legit.kmer.indexes_2931 <- get_legit_kmer_indexes(data_2931)
N_k2_2931<-length(legit.kmer.indexes_2931)
abs.mx_2931 <- get_abskmer_mx(data_2931, legit.kmer.indexes_2931)
total.kmercontent_2931 <- get_total_kmercontent(abs.mx_2931)
Mean_k2_cont_2931<-mean(total.kmercontent_2931)
Mean_k2_cont_per_line_2931<-total.kmercontent_2931 # note: this is in kb
legit.kmer.indexes_2937 <- get_legit_kmer_indexes(data_2937)
N_k2_2937<-length(legit.kmer.indexes_2937)
abs.mx_2937 <- get_abskmer_mx(data_2937, legit.kmer.indexes_2937)
total.kmercontent_2937 <- get_total_kmercontent(abs.mx_2937)
Mean_k2_cont_2937<-mean(total.kmercontent_2937)
Mean_k2_cont_per_line_2937<-total.kmercontent_2937 # note: this is in kb
ancestors<-c("CC-1373", "CC-1952", "CC-2342", "CC-2344", "CC-2931", "CC-2937")
N_k2<-c(N_k2_1373,N_k2_1952,N_k2_2342,N_k2_2344,N_k2_2931,N_k2_2937)
Mean_cont_per_anc<-c(Mean_k2_cont_1373,Mean_k2_cont_1952,Mean_k2_cont_2342,Mean_k2_cont_2344,Mean_k2_cont_2931,Mean_k2_cont_2937)
common.kmer.df<-data.frame(Ancestor=ancestors,N_kmers_over2=N_k2, Kmer_content=Mean_cont_per_anc)
stargazer(common.kmer.df, type="html", summary=FALSE)
| Ancestor | N_kmers_over2 | Kmer_content | |
| 1 | CC-1373 | 150 | 149.286 |
| 2 | CC-1952 | 134 | 186.541 |
| 3 | CC-2342 | 134 | 189.768 |
| 4 | CC-2344 | 170 | 192.691 |
| 5 | CC-2931 | 171 | 190.839 |
| 6 | CC-2937 | 198 | 173.761 |
Mean number of kmers across lines is 159.5.
Mean kmer content across lines is 180.481026 kb.
For a genome of 120 Mb, this equates to 0.1491579% of the genome. Range: 0.1233772 to 0.1592488%.
# Make table of kmer content per line, then plot by ancestor
category <- c(
rep ("CC-1373", times = nrow(data_1373)),
rep ("CC-1952", times = nrow(data_1952)),
rep ("CC-2342", times = nrow(data_2342)),
rep ("CC-2344", times = nrow(data_2344)),
rep ("CC-2931", times = nrow(data_2931)),
rep ("CC-2937", times = nrow(data_2937))
)
total.kmercontent_all <- c(total.kmercontent_1373, total.kmercontent_1952, total.kmercontent_2342, total.kmercontent_2344, total.kmercontent_2931, total.kmercontent_2937)
#length(total.kmercontent_all) #should be 85 if all lines are accounted for
TotalAbs_df <- data.frame (total.kmercontent_all, category)
par(mar=c(5,5,3,1))
boxplot (TotalAbs_df[,1] ~ TotalAbs_df[,2], data=TotalAbs_df, las=2, cex.axis=0.8, ylab="Total tandem repeat content (kb)", Main = "Per MA line absolute total mutation rate", col="deepskyblue3")
#get the kmer >= 2 into a datafile
df.1373=data_1373[,legit.kmer.indexes_1373]
df.1373.melted=melt(df.1373)
colnames(df.1373.melted) = c("Line", "Kmer", "Copy_number")
#plot one line
#ggplot(df.1373.melted[which(df.1373.melted$Line == "CC-1373_MA1"),], aes(x=log10(Copy_number), color=Line)) +
# geom_histogram() +
# facet_wrap(~Line)
#plot all lines
ggplot(df.1373.melted, aes(x=log10(Copy_number), color=Line)) +
geom_histogram() +
facet_wrap(~Line) +
theme(legend.position = "none") +
ggtitle("MA lines derived from CC_1373")
#get the kmer >= 2 into a datafile
df.1952=data_1952[,legit.kmer.indexes_1952]
df.1952.melted=melt(df.1952)
colnames(df.1952.melted) = c("Line", "Kmer", "Copy_number")
#plot all lines
ggplot(df.1952.melted, aes(x=log10(Copy_number), color=Line)) +
geom_histogram() +
facet_wrap(~Line) +
theme(legend.position = "none") +
ggtitle("MA lines derived from CC_1952")
#get the kmer >= 2 into a datafile
df.2342=data_2342[,legit.kmer.indexes_2342]
df.2342.melted=melt(df.2342)
colnames(df.2342.melted) = c("Line", "Kmer", "Copy_number")
#plot all lines
ggplot(df.2342.melted, aes(x=log10(Copy_number), color=Line)) +
geom_histogram() +
facet_wrap(~Line) +
theme(legend.position = "none") +
ggtitle("MA lines derived from CC_2342")
#get the kmer >= 2 into a datafile
df.2344=data_2344[,legit.kmer.indexes_2344]
df.2344.melted=melt(df.2344)
colnames(df.2344.melted) = c("Line", "Kmer", "Copy_number")
#plot all lines
ggplot(df.2344.melted, aes(x=log10(Copy_number), color=Line)) +
geom_histogram() +
facet_wrap(~Line) +
theme(legend.position = "none") +
ggtitle("MA lines derived from CC_2344")
#get the kmer >= 2 into a datafile
df.2931=data_2931[,legit.kmer.indexes_2931]
df.2931.melted=melt(df.2931)
colnames(df.2931.melted) = c("Line", "Kmer", "Copy_number")
#plot all lines
ggplot(df.2931.melted, aes(x=log10(Copy_number), color=Line)) +
geom_histogram() +
facet_wrap(~Line) +
theme(legend.position = "none") +
ggtitle("MA lines derived from CC_2931")
#get the kmer >= 2 into a datafile
df.2937=data_2937[,legit.kmer.indexes_2937]
df.2937.melted=melt(df.2937)
colnames(df.2937.melted) = c("Line", "Kmer", "Copy_number")
#plot all lines
ggplot(df.2937.melted, aes(x=log10(Copy_number), color=Line)) +
geom_histogram() +
facet_wrap(~Line) +
theme(legend.position = "none") +
ggtitle("MA lines derived from CC_2937")
get_1000_kmers <- function (kmer_matrix) {
common.kmer.indexes <- c()
for (i in 1:ncol(kmer_matrix)) {
if (mean(kmer_matrix[,i])>=1000) {
common.kmer.indexes <- c(common.kmer.indexes, i)
}
}
names(common.kmer.indexes) <- colnames(kmer_matrix)[common.kmer.indexes]
return (common.kmer.indexes)
}
kmer1000.indexes_1373 <- get_1000_kmers(data_1373)
k1000_1373<-length(kmer1000.indexes_1373) # length 4
kmer1000.indexes_1952 <- get_1000_kmers(data_1952)
k1000_1952<-length(kmer1000.indexes_1952) # length 5
kmer1000.indexes_2342 <- get_1000_kmers(data_2342)
k1000_2342<-length(kmer1000.indexes_2342) # length 6
kmer1000.indexes_2344 <- get_1000_kmers(data_2344)
k1000_2344<-length(kmer1000.indexes_2344) # length 3
kmer1000.indexes_2931 <- get_1000_kmers(data_2931)
k1000_2391<-length(kmer1000.indexes_2931) # length 7
kmer1000.indexes_2937 <- get_1000_kmers(data_2937)
k1000_2397<-length(kmer1000.indexes_2937) # length 5
k1000s<-c(k1000_1373,k1000_1952,k1000_2342,k1000_2344,k1000_2391,k1000_2397)
common.kmer.df$N_kmers_over1000<-k1000s
stargazer(common.kmer.df, type="html", summary=FALSE)
| Ancestor | N_kmers_over2 | Kmer_content | N_kmers_over1000 | |
| 1 | CC-1373 | 150 | 149.286 | 4 |
| 2 | CC-1952 | 134 | 186.541 | 5 |
| 3 | CC-2342 | 134 | 189.768 | 6 |
| 4 | CC-2344 | 170 | 192.691 | 3 |
| 5 | CC-2931 | 171 | 190.839 | 7 |
| 6 | CC-2937 | 198 | 173.761 | 5 |
df.k1000_1373=data_1373[,which(apply(data_1373, 2, mean) >=1000)]
df.k1000_1952=data_1952[,which(apply(data_1952, 2, mean) >=1000)]
df.k1000_2342=data_2342[,which(apply(data_2342, 2, mean) >=1000)]
df.k1000_2344=data_2344[,which(apply(data_2344, 2, mean) >=1000)]
df.k1000_2931=data_2931[,which(apply(data_2931, 2, mean) >=1000)]
df.k1000_2937=data_2937[,which(apply(data_2937, 2, mean) >=1000)]
stargazer(df.k1000_1373, type="html", summary=FALSE)
| AC | AGC | AAAACCCT | AAATAACAAT | |
| CC-1373_MA1 | 26,012 | 5,395 | 1,533 | 1,193 |
| CC-1373_MA2 | 27,632 | 5,600 | 1,529 | 1,110 |
| CC-1373_MA3 | 31,828 | 5,520 | 1,051 | 843 |
| CC-1373_MA4 | 27,486 | 4,378 | 1,067 | 737 |
| CC-1373_MA5 | 31,074 | 5,602 | 1,388 | 1,452 |
| CC-1373_MA6 | 20,110 | 5,460 | 1,245 | 1,193 |
| CC-1373_MA7 | 20,204 | 5,553 | 1,040 | 2,018 |
| CC-1373_MA8 | 23,282 | 5,525 | 1,205 | 914 |
| CC-1373_MA10 | 30,943 | 5,309 | 1,191 | 1,069 |
| CC-1373_MA11 | 26,113 | 5,593 | 1,678 | 2,095 |
| CC-1373_MA13 | 20,102 | 5,456 | 1,489 | 973 |
| CC-1373_MA14 | 34,684 | 5,471 | 1,312 | 1,337 |
| CC-1373_MA15 | 24,011 | 5,446 | 1,329 | 711 |
stargazer(df.k1000_1952, type="html", summary=FALSE)
| AC | AGC | AAAACCCT | C | AAATTTATATACTCC | |
| CC-1952_MA1 | 35,987 | 6,546 | 1,856 | 2,276 | 702 |
| CC-1952_MA2 | 34,929 | 7,259 | 1,828 | 156 | 771 |
| CC-1952_MA3 | 37,201 | 7,443 | 1,877 | 216 | 553 |
| CC-1952_MA4 | 25,561 | 6,583 | 3,050 | 42 | 630 |
| CC-1952_MA5 | 20,052 | 6,478 | 2,340 | 133 | 770 |
| CC-1952_MA6 | 27,441 | 6,877 | 1,893 | 240 | 537 |
| CC-1952_MA7 | 25,441 | 6,688 | 2,267 | 431 | 1,261 |
| CC-1952_MA8 | 14,749 | 6,294 | 993 | 7,711 | 3,063 |
| CC-1952_MA10 | 38,330 | 6,769 | 3,192 | 79 | 1,753 |
| CC-1952_MA11 | 20,893 | 6,281 | 2,232 | 243 | 546 |
| CC-1952_MA12 | 29,318 | 6,620 | 1,953 | 124 | 1,215 |
| CC-1952_MA13 | 27,513 | 6,414 | 1,947 | 3,195 | 756 |
| CC-1952_MA14 | 25,832 | 6,690 | 1,945 | 132 | 1,743 |
| CC-1952_MA15 | 32,621 | 6,671 | 1,975 | 208 | 814 |
stargazer(df.k1000_2342, type="html", summary=FALSE)
| AC | AGC | AAAACCCT | C | AGGCGG | AAATAGCAGTATAT | |
| CC-2342_MA2 | 27,495 | 6,542 | 1,544 | 457 | 945 | 373 |
| CC-2342_MA3 | 31,747 | 5,932 | 1,861 | 2,459 | 2,424 | 1,294 |
| CC-2342_MA4 | 28,322 | 5,846 | 2,018 | 2,864 | 2,361 | 6,268 |
| CC-2342_MA5 | 51,735 | 6,784 | 1,324 | 13,656 | 1,802 | 523 |
| CC-2342_MA7 | 42,453 | 6,692 | 1,107 | 859 | 933 | 872 |
| CC-2342_MA8 | 35,689 | 6,193 | 1,285 | 625 | 917 | 1,340 |
| CC-2342_MA9 | 47,907 | 6,102 | 1,934 | 2,357 | 1,454 | 720 |
| CC-2342_MA10 | 23,678 | 6,242 | 1,960 | 824 | 1,294 | 3,790 |
| CC-2342_MA11 | 26,074 | 6,215 | 2,054 | 649 | 1,322 | 2,992 |
| CC-2342_MA12 | 33,869 | 5,582 | 1,390 | 0 | 916 | 4,167 |
| CC-2342_MA13 | 19,013 | 6,467 | 1,928 | 326 | 827 | 2,035 |
| CC-2342_MA15 | 28,926 | 6,460 | 1,361 | 82 | 989 | 1,293 |
| CC-2342_MA1 | 54,612 | 6,729 | 1,359 | 12,161 | 1,498 | 434 |
| CC-2342_MA14 | 39,318 | 6,080 | 1,325 | 1,853 | 1,256 | 858 |
stargazer(df.k1000_2344, type="html", summary=FALSE)
| AC | AGC | AAAACCCT | |
| CC-2344_MA1 | 43,243 | 6,629 | 5,310 |
| CC-2344_MA2 | 35,268 | 6,495 | 3,789 |
| CC-2344_MA3 | 30,043 | 6,194 | 4,485 |
| CC-2344_MA4 | 41,318 | 6,078 | 3,228 |
| CC-2344_MA5 | 28,845 | 6,084 | 3,527 |
| CC-2344_MA6 | 30,731 | 6,431 | 3,216 |
| CC-2344_MA7 | 30,779 | 6,505 | 3,797 |
| CC-2344_MA8 | 29,821 | 6,349 | 4,207 |
| CC-2344_MA9 | 40,716 | 6,649 | 4,140 |
| CC-2344_MA10 | 36,762 | 6,292 | 4,102 |
| CC-2344_MA11 | 52,595 | 6,156 | 4,280 |
| CC-2344_MA12 | 30,038 | 6,568 | 3,541 |
| CC-2344_MA13 | 37,310 | 6,512 | 3,920 |
| CC-2344_MA14 | 26,239 | 6,455 | 4,630 |
| CC-2344_MA15 | 35,658 | 6,452 | 4,020 |
stargazer(df.k1000_2931, type="html", summary=FALSE)
| AC | AGC | AAAACCCT | C | AGGCGG | AAT | AAATAGCAGTATAT | |
| CC-2931_MA1 | 47,594 | 6,836 | 2,147 | 1,600 | 1,062 | 1,970 | 3,450 |
| CC-2931_MA2 | 22,896 | 6,520 | 1,919 | 802 | 1,065 | 319 | 3,690 |
| CC-2931_MA3 | 19,061 | 6,079 | 2,762 | 627 | 820 | 401 | 2,122 |
| CC-2931_MA4 | 43,800 | 6,930 | 1,760 | 2,704 | 987 | 290 | 940 |
| CC-2931_MA5 | 44,342 | 6,695 | 2,105 | 1,404 | 1,144 | 214 | 736 |
| CC-2931_MA6 | 33,487 | 6,580 | 1,258 | 1,521 | 1,155 | 3 | 299 |
| CC-2931_MA7 | 25,288 | 6,717 | 1,704 | 51 | 961 | 1,700 | 1,096 |
| CC-2931_MA9 | 23,416 | 6,711 | 2,176 | 106 | 771 | 733 | 2,137 |
| CC-2931_MA10 | 22,260 | 6,215 | 2,019 | 3,174 | 1,187 | 1,581 | 3,523 |
| CC-2931_MA11 | 25,600 | 6,185 | 2,466 | 3,164 | 1,113 | 1,639 | 2,603 |
| CC-2931_MA12 | 41,723 | 7,593 | 3,581 | 531 | 625 | 1,132 | 906 |
| CC-2931_MA13 | 29,600 | 6,292 | 1,736 | 3,238 | 1,154 | 1,746 | 1,289 |
| CC-2931_MA14 | 32,308 | 6,896 | 1,297 | 1,693 | 1,438 | 2,773 | 2,859 |
| CC-2931_MA15 | 30,671 | 7,037 | 1,020 | 302 | 1,144 | 107 | 898 |
stargazer(df.k1000_2937, type="html", summary=FALSE)
| AC | AGC | AAAACCCT | C | AAT | |
| CC-2937_MA1 | 44,678 | 6,549 | 1,980 | 6,801 | 352 |
| CC-2937_MA2 | 43,609 | 7,169 | 2,118 | 3,165 | 504 |
| CC-2937_MA3 | 25,113 | 7,038 | 1,975 | 325 | 1,087 |
| CC-2937_MA4 | 27,794 | 7,236 | 2,003 | 244 | 1,561 |
| CC-2937_MA5 | 25,929 | 7,120 | 2,229 | 201 | 1,402 |
| CC-2937_MA6 | 27,392 | 7,403 | 2,083 | 197 | 1,036 |
| CC-2937_MA7 | 20,357 | 6,630 | 1,561 | 108 | 230 |
| CC-2937_MA8 | 28,095 | 6,202 | 2,249 | 772 | 743 |
| CC-2937_MA9 | 16,459 | 6,252 | 2,071 | 241 | 1,740 |
| CC-2937_MA10 | 19,726 | 6,685 | 2,403 | 1,019 | 860 |
| CC-2937_MA11 | 21,322 | 6,002 | 2,418 | 143 | 1,575 |
| CC-2937_MA12 | 25,583 | 6,700 | 2,017 | 410 | 1,121 |
| CC-2937_MA13 | 16,381 | 6,703 | 2,234 | 1,257 | 1,022 |
| CC-2937_MA14 | 32,067 | 6,424 | 2,111 | 962 | 1,046 |
| CC-2937_MA15 | 29,429 | 6,630 | 1,746 | 2,066 | 950 |
AC_copy_1373=mean (data_1373[,which(colnames(data_1373) == "AC")])
AC_copy_1952=mean (data_1952[,which(colnames(data_1952) == "AC")])
AC_copy_2342=mean (data_2342[,which(colnames(data_2342) == "AC")])
AC_copy_2344=mean (data_2344[,which(colnames(data_2344) == "AC")])
AC_copy_2931=mean (data_2931[,which(colnames(data_2931) == "AC")])
AC_copy_2937=mean (data_2937[,which(colnames(data_2937) == "AC")])
AC_copy_number=c(AC_copy_1373,AC_copy_1952,AC_copy_2342,AC_copy_2344,AC_copy_2931,AC_copy_2937)
common.kmer.df$AC_copy_number=AC_copy_number
AC is the most abundant (by an order of magnitude).
It has a mean copy number of 3.059207910^{4} across lines derived from a single ancestor.
AC is the most abundant repeat in all ancestors.
stargazer(common.kmer.df,type="html", summary=FALSE)
| Ancestor | N_kmers_over2 | Kmer_content | N_kmers_over1000 | AC_copy_number | |
| 1 | CC-1373 | 150 | 149.286 | 4 | 26,421.620 |
| 2 | CC-1952 | 134 | 186.541 | 5 | 28,276.290 |
| 3 | CC-2342 | 134 | 189.768 | 6 | 35,059.860 |
| 4 | CC-2344 | 170 | 192.691 | 3 | 35,291.070 |
| 5 | CC-2931 | 171 | 190.839 | 7 | 31,574.710 |
| 6 | CC-2937 | 198 | 173.761 | 5 | 26,928.930 |
Using principal components and NJ trees with kmers >= 2, do relationships reflect ancestral relationships? #### A: PCA clusters lines unequivocally by ancestor The only close ones are CC-1952 and CC-2342.
#PCA
legit_kmers_1373=colnames(df.1373)
legit_kmers_1952=colnames(df.1952)
legit_kmers_2342=colnames(df.2342)
legit_kmers_2344=colnames(df.2344)
legit_kmers_2931=colnames(df.2931)
legit_kmers_2937=colnames(df.2937)
legit_kmers=c(legit_kmers_1373,legit_kmers_1952,legit_kmers_2342,legit_kmers_2344,legit_kmers_2931,legit_kmers_2937)
legit_idx<-match(unique(legit_kmers), colnames(data_sorted))
get_ancestor <- function(row_number) {
Anc <- substr(rownames(data_sorted)[row_number], start = 1, stop = 7)
return (Anc)
}
category <- c()
for (i in 1:nrow(data_sorted)) {
category <- c(category, get_ancestor(i))
}
pca.data <- prcomp(data_sorted[, legit_idx], scale.=T)
# using the 46 union kmers
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F, alpha=0.8)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
Abundant kmers are those with an average of at least 100 copies across MA lines of a single ancestor.
# function that takes in a matrix of copy numbers and returns the indexes of kmers that have a mean copy number across all MA lines of at least 100
get_common_kmers <- function (kmer_matrix) {
common.kmer.indexes <- c()
for (i in 1:ncol(kmer_matrix)) {
if (mean(kmer_matrix[,i])>=100) {
common.kmer.indexes <- c(common.kmer.indexes, i)
}
}
names(common.kmer.indexes) <- colnames(kmer_matrix)[common.kmer.indexes]
return (common.kmer.indexes)
}
# make a new matrix for each ancestral group
data_1373 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-1373"), ]
data_1952 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-1952"), ]
data_2342 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2342"), ]
data_2344 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2344"), ]
data_2931 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2931"), ]
data_2937 <- data_sorted[which (substr(rownames(data_sorted), start = 1, stop = 7) == "CC-2937"), ]
# get the common kmer indexes for each ancestral group, and how many each has
common.kmer.indexes_1373 <- get_common_kmers(data_1373)
length(common.kmer.indexes_1373) # 28
## [1] 28
common.kmer.indexes_1952 <- get_common_kmers(data_1952)
length(common.kmer.indexes_1952) # 31
## [1] 31
common.kmer.indexes_2342 <- get_common_kmers(data_2342)
length(common.kmer.indexes_2342) # 31
## [1] 31
common.kmer.indexes_2344 <- get_common_kmers(data_2344)
length(common.kmer.indexes_2344) # 26
## [1] 26
common.kmer.indexes_2931 <- get_common_kmers(data_2931)
length(common.kmer.indexes_2931) # 29
## [1] 29
common.kmer.indexes_2937 <- get_common_kmers(data_2937)
length(common.kmer.indexes_2937) # 29
## [1] 29
shared.common.kmers <- Reduce(intersect, list
(names(common.kmer.indexes_1373), names(common.kmer.indexes_1952),
names(common.kmer.indexes_2342), names(common.kmer.indexes_2344),
names(common.kmer.indexes_2931), names(common.kmer.indexes_2937)
))
shared.common.kmers
## [1] "AC" "AGC" "AAAACCCT" "C" "AGGCGG" "AAT"
## [7] "AGGC" "ACGC" "AAC" "ACACGC" "AGGCGC" "ACCGGC"
## [13] "CCG" "ACC" "AGGGC" "ACCCGC" "AGCGGC" "AGG"
length(shared.common.kmers)
## [1] 18
#write(shared.common.kmers, file="/Users/jullienflynn/Documents/AndyLab/Chlamy/R_Analysis/data_in/chlamy_shared.common.kmers.txt", sep="\n")
common.kmer.union <- Reduce(union, list
(names(common.kmer.indexes_1373), names(common.kmer.indexes_1952),
names(common.kmer.indexes_2342), names(common.kmer.indexes_2344),
names(common.kmer.indexes_2931), names(common.kmer.indexes_2937)
))
common.kmer.union
## [1] "AC" "AGC" "AAAACCCT"
## [4] "C" "AGGCGG" "AAT"
## [7] "AGGC" "ACGC" "AAC"
## [10] "ACACGC" "AAATACCTTACGGGAATAT" "AGGCGC"
## [13] "ACCGGC" "CCG" "ACC"
## [16] "AGGGC" "ACCCGC" "AAATAACAAT"
## [19] "AGCGGC" "AAAGT" "AGG"
## [22] "ACAGGG" "ATC" "ACGCCC"
## [25] "AAATACTTACGGGAATAT" "AGCGCC" "AGCGGG"
## [28] "ACCCGG" "AAATTTATATACTCC" "ACAGGC"
## [31] "ACCAGC" "AAAGAAGTATATAAAT" "AAAATATATAAATATAGCT"
## [34] "AGCCGG" "AGCCGC" "AATAGATAATAT"
## [37] "AAATAGCAGTATAT" "AAAAAGGTAAATGTATTTAT" "ACGCC"
## [40] "AGGGCC" "AGGGGC" "AGAGGC"
## [43] "AAAATAAAGTGT" "ACTGGGC" "AAACAC"
## [46] "AAAC"
length(common.kmer.union)
## [1] 46
# get the indexes of the common kmers
common.kmer.union.indexes <- Reduce(union, list
(common.kmer.indexes_1373, common.kmer.indexes_1952,
common.kmer.indexes_2342, common.kmer.indexes_2344,
common.kmer.indexes_2931, common.kmer.indexes_2937
))
Use the union of common kmers (46), as well as the overall top 100 kmers to make the PCA.
get_ancestor <- function(row_number) {
Anc <- substr(rownames(data_sorted)[row_number], start = 1, stop = 7)
return (Anc)
}
category <- c()
for (i in 1:nrow(data_sorted)) {
category <- c(category, get_ancestor(i))
}
pca.data <- prcomp(data_sorted[, common.kmer.union.indexes], scale.=T)
# using the 46 union kmers
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
# what are these 4 samples that are separated from the rest of their group?
MA_names <- rownames(data_2342)
pca.data_2342 <- prcomp(data_2342[, common.kmer.indexes_2342], scale.=T)
g <- ggbiplot(pca.data_2342, obs.scale = 1, var.scale = 1, groups = MA_names, var.axes = F)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
data_2342[,1:10]
## AC AGC AAAACCCT C AGGCGG AAT AGGC AAATAGCAGTATAT
## CC-2342_MA2 27495 6542 1544 457 945 886 463 373
## CC-2342_MA3 31747 5932 1861 2459 2424 62 2034 1294
## CC-2342_MA4 28322 5846 2018 2864 2361 228 2006 6268
## CC-2342_MA5 51735 6784 1324 13656 1802 949 1058 523
## CC-2342_MA7 42453 6692 1107 859 933 1987 595 872
## CC-2342_MA8 35689 6193 1285 625 917 1528 559 1340
## CC-2342_MA9 47907 6102 1934 2357 1454 854 598 720
## CC-2342_MA10 23678 6242 1960 824 1294 2125 410 3790
## CC-2342_MA11 26074 6215 2054 649 1322 1613 365 2992
## CC-2342_MA12 33869 5582 1390 0 916 226 440 4167
## CC-2342_MA13 19013 6467 1928 326 827 650 376 2035
## CC-2342_MA15 28926 6460 1361 82 989 627 406 1293
## CC-2342_MA1 54612 6729 1359 12161 1498 699 885 434
## CC-2342_MA14 39318 6080 1325 1853 1256 311 719 858
## ACGC AAC
## CC-2342_MA2 320 343
## CC-2342_MA3 1745 382
## CC-2342_MA4 1148 939
## CC-2342_MA5 607 987
## CC-2342_MA7 484 367
## CC-2342_MA8 482 332
## CC-2342_MA9 595 342
## CC-2342_MA10 345 1078
## CC-2342_MA11 320 1226
## CC-2342_MA12 313 266
## CC-2342_MA13 231 284
## CC-2342_MA15 319 332
## CC-2342_MA1 561 943
## CC-2342_MA14 616 351
# likely MA1 and MA5 are outliters because of poly-C repeat, MA3 and MA4 because of AGGC repeat
# what about the 1952 sample?
MA_names <- rownames(data_1952)
pca.data_1952 <- prcomp(data_1952[, common.kmer.indexes_1952], scale.=T)
g <- ggbiplot(pca.data_1952, obs.scale = 1, var.scale = 1, groups = MA_names, var.axes = F)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
data_1952[,1:10]
## AC AGC AAAACCCT C AGGCGG AAT AGGC AAATAGCAGTATAT ACGC
## CC-1952_MA1 35987 6546 1856 2276 654 642 677 1 763
## CC-1952_MA2 34929 7259 1828 156 595 1045 763 0 680
## CC-1952_MA3 37201 7443 1877 216 538 611 742 0 660
## CC-1952_MA4 25561 6583 3050 42 310 229 454 0 267
## CC-1952_MA5 20052 6478 2340 133 347 233 545 0 332
## CC-1952_MA6 27441 6877 1893 240 492 961 546 0 397
## CC-1952_MA7 25441 6688 2267 431 597 1211 636 0 500
## CC-1952_MA8 14749 6294 993 7711 474 1848 923 0 842
## CC-1952_MA10 38330 6769 3192 79 576 727 720 0 659
## CC-1952_MA11 20893 6281 2232 243 388 636 535 0 372
## CC-1952_MA12 29318 6620 1953 124 444 1545 643 2 544
## CC-1952_MA13 27513 6414 1947 3195 542 36 511 1 577
## CC-1952_MA14 25832 6690 1945 132 386 1159 505 0 457
## CC-1952_MA15 32621 6671 1975 208 388 603 560 0 439
## AAC
## CC-1952_MA1 290
## CC-1952_MA2 257
## CC-1952_MA3 241
## CC-1952_MA4 317
## CC-1952_MA5 260
## CC-1952_MA6 297
## CC-1952_MA7 256
## CC-1952_MA8 179
## CC-1952_MA10 836
## CC-1952_MA11 302
## CC-1952_MA12 345
## CC-1952_MA13 326
## CC-1952_MA14 311
## CC-1952_MA15 281
# CC1952-MA9 is an outlier probably mainly driven by lower number of telomere repeat
# Now using PCs 2-3
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F, choices=2:3)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F, choices=c(1,3))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
# Using the overall top 100 kmers
pca.data <- prcomp(data_sorted[, 1:100], scale.=T)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F, choices=2:3)
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
g <- ggbiplot(pca.data, obs.scale = 1, var.scale = 1, groups = category, var.axes = F, choices=c(1,3))
g <- g + scale_color_discrete(name='')
g <- g + theme(legend.direction='horizontal', legend.position='top')
print(g)
Choose a few kmers to illustrate
Fig1_kmers <- c("AGGCGG", "AAC", "ACACGC", "ACCGGC", "AGGCGC")
Ancestors <- c(rep("CC-1373", times=5), rep("CC-1952", times=5), rep("CC-2342", times=5), rep("CC-2344", times=5), rep("CC-2931", times=5), rep("CC-2937", times=5))
#Ancestors
# make a data frame
means_1373 <- c()
means_1952 <- c()
means_2342 <- c()
means_2344 <- c()
means_2931 <- c()
means_2937 <- c()
for (i in 1:length(Fig1_kmers)) {
means_1373 <- c(means_1373, mean(data_1373[,Fig1_kmers[i]]))
means_1952 <- c(means_1952, mean(data_1952[,Fig1_kmers[i]]))
means_2342 <- c(means_2342, mean(data_2342[,Fig1_kmers[i]]))
means_2344 <- c(means_2344, mean(data_2344[,Fig1_kmers[i]]))
means_2931 <- c(means_2931, mean(data_2931[,Fig1_kmers[i]]))
means_2937 <- c(means_2937, mean(data_2937[,Fig1_kmers[i]]))
}
means_fig1 <- c(means_1373, means_1952, means_2342, means_2344, means_2931, means_2937)
Fig1_df <- data.frame(Ancestors, Fig1_kmers, means_fig1)
ggplot(Fig1_df, aes(fill=Ancestors, x=Fig1_kmers, y=means_fig1)) +
geom_bar(position="dodge", stat="identity") +
ylab("Mean copy number") +
scale_fill_brewer(palette="Set1") +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank())
# function to calculate length of string
Calc_kmer_length <- function (kmer_string) {
return (nchar(kmer_string))
}
# function to calculate GC content of a kmer
Calc_GC_content <- function (kmer_string) {
string <- BString(kmer_string)
gc.content <- c(letterFrequency(string, letters=c("CG"), as.prob=F)/(length(string)))
return(gc.content)
}
common.kmer.union.lengths <- sapply(common.kmer.union, Calc_kmer_length)
common.kmer.union.gc <- sapply(common.kmer.union, Calc_GC_content)
hist(common.kmer.union.lengths, breaks=11)
hist(common.kmer.union.gc, breaks=10)
# 6mers are the most abundant
length(which(common.kmer.union.lengths == 6))
## [1] 19
qplot(x=common.kmer.union.lengths, geom="histogram", bins=15) + xlab("Kmer unit length") +
theme_bw() +
theme(axis.text = element_text(size=13), axis.title = element_text(size=13))
qplot(x=common.kmer.union.gc, geom="histogram", bins=15) + scale_x_reverse() + xlab("GC of kmer") +
theme_bw() +
theme(axis.text = element_text(size=13), axis.title = element_text(size=13))
# Make an extra table
table2 <- data.frame(common.kmer.union, common.kmer.union.lengths, common.kmer.union.gc)
#write.table(table2, file = "/Users/jullienflynn/Documents/AndyLab/Chlamy/ManuscriptFigures/Table2.txt", row.names = F, quote=F)
Maybe some repeats are coming from the chloroplast given their GC contents and the previous finding that the chloroplast DOES contain repeats.
mean(common.kmer.union.gc)
## [1] 0.5661754
hist (sapply (names(which(common.kmer.union.lengths == 6)), Calc_GC_content), main="6-mer GC", xlab="GC")
hist (sapply (names(which(common.kmer.union.lengths != 6)), Calc_GC_content), main="non-6mer GC", xlab="GC")
# all the longer kmers >=10 bp are GC < 0.35
hist (sapply (names(which(common.kmer.union.lengths >= 10)), Calc_GC_content), main="10+ mer GC", xlab="GC")
hist (sapply (names(which(common.kmer.union.lengths <= 9)), Calc_GC_content), main="<9 mer GC", xlab="GC")
length(which(common.kmer.union.lengths <= 9))
## [1] 36
mean(sapply (names(which(common.kmer.union.lengths >= 10)), Calc_GC_content))
## [1] 0.1738116
# average GC is 17.4 !!
max(sapply (names(which(common.kmer.union.lengths >= 10)), Calc_GC_content))
## [1] 0.3157895
mean(sapply (names(which(common.kmer.union.lengths <= 9)), Calc_GC_content))
## [1] 0.6751653
# make Figure 2c
assign_group <- function(length){
if (length >= 10) {
group <- 1
}
else {
group <- 2
}
return(group)
}
group_assignments <- sapply (common.kmer.union.lengths, assign_group)
group_assignments
## AC AGC AAAACCCT
## 2 2 2
## C AGGCGG AAT
## 2 2 2
## AGGC ACGC AAC
## 2 2 2
## ACACGC AAATACCTTACGGGAATAT AGGCGC
## 2 1 2
## ACCGGC CCG ACC
## 2 2 2
## AGGGC ACCCGC AAATAACAAT
## 2 2 1
## AGCGGC AAAGT AGG
## 2 2 2
## ACAGGG ATC ACGCCC
## 2 2 2
## AAATACTTACGGGAATAT AGCGCC AGCGGG
## 1 2 2
## ACCCGG AAATTTATATACTCC ACAGGC
## 2 1 2
## ACCAGC AAAGAAGTATATAAAT AAAATATATAAATATAGCT
## 2 1 1
## AGCCGG AGCCGC AATAGATAATAT
## 2 2 1
## AAATAGCAGTATAT AAAAAGGTAAATGTATTTAT ACGCC
## 1 1 2
## AGGGCC AGGGGC AGAGGC
## 2 2 2
## AAAATAAAGTGT ACTGGGC AAACAC
## 1 2 2
## AAAC
## 2
cp_plot <- data.frame(lengths=common.kmer.union.lengths, gc=common.kmer.union.gc, grp=as.factor(group_assignments))
ggplot (data = cp_plot, aes(y = cp_plot$gc, x = cp_plot$lengths, color = cp_plot$grp )) +
geom_jitter(size=3, alpha=0.5) +
xlab("Kmer unit length") +
ylab("GC content") +
theme_bw() +
theme(axis.text = element_text(size=14), axis.title = element_text(size=14), legend.position="none")
temp <- which(colnames (data_1373)=="AAAACCCT")
telomere_abun_1373 <- data.frame(data_1373[,temp], "CC-1373")
colnames(telomere_abun_1373) <- c("copynumber", "Ancestor")
telomere_mean_1373 <- mean(data_1373[,temp])
temp <- which(colnames (data_1952)=="AAAACCCT")
telomere_abun_1952 <- data.frame(data_1952[,temp], "CC-1952")
colnames(telomere_abun_1952) <- c("copynumber", "Ancestor")
telomere_mean_1952 <- mean(data_1952[,temp])
temp <- which(colnames (data_2342)=="AAAACCCT")
telomere_abun_2342 <- data.frame(data_2342[,temp], "CC-2342")
colnames(telomere_abun_2342) <- c("copynumber", "Ancestor")
telomere_mean_2342 <- mean(data_2342[,temp])
temp <- which(colnames (data_2344)=="AAAACCCT")
telomere_abun_2344 <- data.frame(data_2344[,temp], "CC-2344")
colnames(telomere_abun_2344) <- c("copynumber", "Ancestor")
telomere_mean_2344 <- mean(data_2344[,temp])
temp <- which(colnames (data_2931)=="AAAACCCT")
telomere_abun_2931 <- data.frame(data_2931[,temp], "CC-2931")
colnames(telomere_abun_2931) <- c("copynumber", "Ancestor")
telomere_mean_2931 <- mean(data_2931[,temp])
temp <- which(colnames (data_2937)=="AAAACCCT")
telomere_abun_2937 <- data.frame(data_2937[,temp], "CC-2937")
colnames(telomere_abun_2937) <- c("copynumber", "Ancestor")
telomere_mean_2937 <- mean(data_2937[,temp])
telomere_abun_df <- rbind(telomere_abun_1373, telomere_abun_1952, telomere_abun_2342,
telomere_abun_2344, telomere_abun_2931, telomere_abun_2937)
boxplot(telomere_abun_df[,1] ~ telomere_abun_df[,2], data=telomere_abun_df, las=2, ylab = "Telomere copy number")
# seems like CC-2344 has higher telomere copy numbers
telomere_model <- aov(copynumber ~ Ancestor, data = telomere_abun_df)
summary(telomere_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## Ancestor 5 65453584 13090717 63.77 <2e-16 ***
## Residuals 79 16217709 205287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# anova indicates significant difference
posthoc_telomere <- TukeyHSD(x=telomere_model, conf.level = 0.95)
posthoc_telomere
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = copynumber ~ Ancestor, data = telomere_abun_df)
##
## $Ancestor
## diff lwr upr p adj
## CC-1952-CC-1373 784.20879 274.4852 1293.932358 0.0003319
## CC-2342-CC-1373 291.49451 -218.2291 801.218072 0.5552132
## CC-2344-CC-1373 2700.72308 2199.2470 3202.199172 0.0000000
## CC-2931-CC-1373 684.35165 174.6281 1194.075215 0.0024886
## CC-2937-CC-1373 767.78974 266.3136 1269.265838 0.0003595
## CC-2342-CC-1952 -492.71429 -992.9095 7.480897 0.0559447
## CC-2344-CC-1952 1916.51429 1424.7263 2408.302228 0.0000000
## CC-2931-CC-1952 -99.85714 -600.0523 400.338040 0.9918848
## CC-2937-CC-1952 -16.41905 -508.2070 475.368895 0.9999987
## CC-2344-CC-2342 2409.22857 1917.4406 2901.016514 0.0000000
## CC-2931-CC-2342 392.85714 -107.3380 893.052326 0.2088176
## CC-2937-CC-2342 476.29524 -15.4927 968.083181 0.0632172
## CC-2931-CC-2344 -2016.37143 -2508.1594 -1524.583486 0.0000000
## CC-2937-CC-2344 -1932.93333 -2416.1678 -1449.698877 0.0000000
## CC-2937-CC-2931 83.43810 -408.3498 575.226038 0.9962013
help("TukeyHSD")
hist(telomere_abun_1373$copynumber)
hist(telomere_abun_1952$copynumber)
hist(telomere_abun_2342$copynumber)
hist(telomere_abun_2344$copynumber)
hist(telomere_abun_2931$copynumber)
hist(telomere_abun_2937$copynumber)
get_residuals <- function (abun_mx) {
x <- abun_mx$copynumber - mean(abun_mx$copynumber)
return(x)
}
residuals_1373 <- get_residuals(telomere_abun_1373)
residuals_1952 <- get_residuals(telomere_abun_1952)
residuals_2342 <- get_residuals(telomere_abun_2342)
residuals_2344 <- get_residuals(telomere_abun_2344)
residuals_2931 <- get_residuals(telomere_abun_2931)
residuals_2937 <- get_residuals(telomere_abun_2937)
residuals_all <- c(residuals_1373, residuals_1952, residuals_2342, residuals_2344, residuals_2931, residuals_2937)
hist(residuals_all)
# great, the residuals are approximately normally distributed, parametric test is fine to do
# For Figure 1B
mean(data_1373[,"AAATACCTTACGGGAATAT"])
## [1] 390.2308
mean(data_1952[,"AAATACCTTACGGGAATAT"])
## [1] 509.1429
mean(data_2342[,"AAATACCTTACGGGAATAT"])
## [1] 0.07142857
mean(data_2344[,"AAATACCTTACGGGAATAT"])
## [1] 170.9333
mean(data_2931[,"AAATACCTTACGGGAATAT"])
## [1] 0.5714286
mean(data_2937[,"AAATACCTTACGGGAATAT"])
## [1] 929.4
mean(data_1373[,"AAATAACAAT"])
## [1] 1203.462
mean(data_1952[,"AAATAACAAT"])
## [1] 0.4285714
mean(data_2342[,"AAATAACAAT"])
## [1] 0.1428571
mean(data_2344[,"AAATAACAAT"])
## [1] 0.1333333
mean(data_2931[,"AAATAACAAT"])
## [1] 0.1428571
mean(data_2937[,"AAATAACAAT"])
## [1] 0.06666667
mean(data_1373[,"ATC"])
## [1] 275.2308
mean(data_1952[,"ATC"])
## [1] 108.0714
mean(data_2342[,"ATC"])
## [1] 0.1428571
mean(data_2344[,"ATC"])
## [1] 76.93333
mean(data_2931[,"ATC"])
## [1] 323.7857
mean(data_2937[,"ATC"])
## [1] 34.26667
mean(data_1373[,"AAATACTTACGGGAATAT"])
## [1] 351.3077
mean(data_1952[,"AAATACTTACGGGAATAT"])
## [1] 148.0714
mean(data_2342[,"AAATACTTACGGGAATAT"])
## [1] 0.1428571
mean(data_2344[,"AAATACTTACGGGAATAT"])
## [1] 0.1333333
mean(data_2931[,"AAATACTTACGGGAATAT"])
## [1] 0.07142857
mean(data_2937[,"AAATACTTACGGGAATAT"])
## [1] 171.3333
mean(data_1373[,"ACCCGG"])
## [1] 103.3846
mean(data_1952[,"ACCCGG"])
## [1] 0
mean(data_2342[,"ACCCGG"])
## [1] 7.642857
mean(data_2344[,"ACCCGG"])
## [1] 0
mean(data_2931[,"ACCCGG"])
## [1] 0
mean(data_2937[,"ACCCGG"])
## [1] 72.13333
mean(data_1373[,"AAATTTATATACTCC"])
## [1] 0.1538462
mean(data_1952[,"AAATTTATATACTCC"])
## [1] 1079.571
mean(data_2342[,"AAATTTATATACTCC"])
## [1] 0.5
mean(data_2344[,"AAATTTATATACTCC"])
## [1] 597.2
mean(data_2931[,"AAATTTATATACTCC"])
## [1] 0
mean(data_2937[,"AAATTTATATACTCC"])
## [1] 0.2
mean(data_1373[,"AAAGAAGTATATAAAT"])
## [1] 0.2307692
mean(data_1952[,"AAAGAAGTATATAAAT"])
## [1] 902.0714
mean(data_2342[,"AAAGAAGTATATAAAT"])
## [1] 0.1428571
mean(data_2344[,"AAAGAAGTATATAAAT"])
## [1] 0.06666667
mean(data_2931[,"AAAGAAGTATATAAAT"])
## [1] 0.1428571
mean(data_2937[,"AAAGAAGTATATAAAT"])
## [1] 0.1333333
mean(data_1373[,"AAAATATATAAATATAGCT"])
## [1] 0.1538462
mean(data_1952[,"AAAATATATAAATATAGCT"])
## [1] 583.2143
mean(data_2342[,"AAAATATATAAATATAGCT"])
## [1] 0.07142857
mean(data_2344[,"AAAATATATAAATATAGCT"])
## [1] 74.13333
mean(data_2931[,"AAAATATATAAATATAGCT"])
## [1] 0
mean(data_2937[,"AAAATATATAAATATAGCT"])
## [1] 0.06666667
mean(data_1373[,"AATAGATAATAT"])
## [1] 0
mean(data_1952[,"AATAGATAATAT"])
## [1] 111.7857
mean(data_2342[,"AATAGATAATAT"])
## [1] 0
mean(data_2344[,"AATAGATAATAT"])
## [1] 0.06666667
mean(data_2931[,"AATAGATAATAT"])
## [1] 0
mean(data_2937[,"AATAGATAATAT"])
## [1] 0.06666667
mean(data_1373[,"AAATAGCAGTATAT"])
## [1] 0.6153846
mean(data_1952[,"AAATAGCAGTATAT"])
## [1] 0.2857143
mean(data_2342[,"AAATAGCAGTATAT"])
## [1] 1925.643
mean(data_2344[,"AAATAGCAGTATAT"])
## [1] 0.5333333
mean(data_2931[,"AAATAGCAGTATAT"])
## [1] 1896.286
mean(data_2937[,"AAATAGCAGTATAT"])
## [1] 0.4
mean(data_1373[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.1538462
mean(data_1952[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.07142857
mean(data_2342[,"AAAAAGGTAAATGTATTTAT"])
## [1] 346
mean(data_2344[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.06666667
mean(data_2931[,"AAAAAGGTAAATGTATTTAT"])
## [1] 317.7857
mean(data_2937[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.2
mean(data_1373[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.1538462
mean(data_1952[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.07142857
mean(data_2342[,"AAAAAGGTAAATGTATTTAT"])
## [1] 346
mean(data_2344[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.06666667
mean(data_2931[,"AAAAAGGTAAATGTATTTAT"])
## [1] 317.7857
mean(data_2937[,"AAAAAGGTAAATGTATTTAT"])
## [1] 0.2
mean(data_1373[,"AGGGCC"])
## [1] 0.2307692
mean(data_1952[,"AGGGCC"])
## [1] 33.14286
mean(data_2342[,"AGGGCC"])
## [1] 386.2143
mean(data_2344[,"AGGGCC"])
## [1] 9.4
mean(data_2931[,"AGGGCC"])
## [1] 13.21429
mean(data_2937[,"AGGGCC"])
## [1] 1.466667
mean(data_1373[,"AGAGGC"])
## [1] 3
mean(data_1952[,"AGAGGC"])
## [1] 1.5
mean(data_2342[,"AGAGGC"])
## [1] 125.0714
mean(data_2344[,"AGAGGC"])
## [1] 0
mean(data_2931[,"AGAGGC"])
## [1] 0
mean(data_2937[,"AGAGGC"])
## [1] 7.733333
mean(data_1373[,"AAAATAAAGTGT"])
## [1] 0.2307692
mean(data_1952[,"AAAATAAAGTGT"])
## [1] 0.8571429
mean(data_2342[,"AAAATAAAGTGT"])
## [1] 0.3571429
mean(data_2344[,"AAAATAAAGTGT"])
## [1] 846.1333
mean(data_2931[,"AAAATAAAGTGT"])
## [1] 646.2857
mean(data_2937[,"AAAATAAAGTGT"])
## [1] 0.4666667
mean(data_1373[,"ACTGGGC"])
## [1] 0
mean(data_1952[,"ACTGGGC"])
## [1] 0
mean(data_2342[,"ACTGGGC"])
## [1] 0
mean(data_2344[,"ACTGGGC"])
## [1] 0
mean(data_2931[,"ACTGGGC"])
## [1] 121.1429
mean(data_2937[,"ACTGGGC"])
## [1] 0
mean(data_1373[,"AAACAC"])
## [1] 8.461538
mean(data_1952[,"AAACAC"])
## [1] 0
mean(data_2342[,"AAACAC"])
## [1] 26.71429
mean(data_2344[,"AAACAC"])
## [1] 5.666667
mean(data_2931[,"AAACAC"])
## [1] 58.28571
mean(data_2937[,"AAACAC"])
## [1] 131.6
mean(data_1373[,"AAAC"])
## [1] 7.846154
mean(data_1952[,"AAAC"])
## [1] 0
mean(data_2342[,"AAAC"])
## [1] 29
mean(data_2344[,"AAAC"])
## [1] 0
mean(data_2931[,"AAAC"])
## [1] 0.2142857
mean(data_2937[,"AAAC"])
## [1] 170.4
For each ancestral lineage, get the absolute amount of kmer content in kb. Use all the kmers that have at least 2 copies (normalized) in all the MA lines
# get kmers that will be used for the overall genome-wide description of kmers in the 6 ancestors
# have at least 2 normalized copies in all MA lines from the a given ancestor
get_legit_kmer_indexes <- function (kmer_mx) {
indexes <- c()
for (i in 1:ncol(kmer_mx)) { ## go through each column of the data
p <- kmer_mx[,i] >= 2 ## at least 2 copies
if (sum(p)==nrow(kmer_mx)) { # if all samples have at least 2 copies
indexes <- c(indexes, i)
}
}
return(indexes)
}
#get matrix with absolute kmer content (multiplied by length of kmer)
get_abskmer_mx <- function (kmer_mx, indexes) {
kmers.abs <- matrix(nrow=nrow(kmer_mx), ncol=max(indexes))
for (i in indexes) {
for (j in 1:nrow(kmer_mx)) {
kmers.abs[j,i] <- kmer_mx[j,i] * (nchar(colnames(kmer_mx)[i], type="chars"))
}
}
kmers.abs <- kmers.abs[,colSums(is.na(kmers.abs)) != nrow(kmers.abs)] # get rid of columns that are all NAs
return(kmers.abs)
}
# get total kmer content per MA line
get_total_kmercontent <- function (abs_mx) {
total.kmers.abs <- c()
for (i in 1:nrow(abs_mx)){
total.kmers.abs[i] <- sum(abs_mx[i,])/10^3
}
names(total.kmers.abs) <- rownames(abs_mx)
return (total.kmers.abs)
}
legit.kmer.indexes_1373 <- get_legit_kmer_indexes(data_1373)
length(legit.kmer.indexes_1373)
## [1] 150
abs.mx_1373 <- get_abskmer_mx(data_1373, legit.kmer.indexes_1373)
total.kmercontent_1373 <- get_total_kmercontent(abs.mx_1373)
mean(total.kmercontent_1373)
## [1] 149.2864
total.kmercontent_1373 # note: this is in kb
## [1] 159.611 150.749 153.285 134.614 161.199 134.988 141.093 131.020
## [9] 163.460 178.961 127.271 172.295 132.177
legit.kmer.indexes_1952 <- get_legit_kmer_indexes(data_1952)
length(legit.kmer.indexes_1952)
## [1] 134
abs.mx_1952 <- get_abskmer_mx(data_1952, legit.kmer.indexes_1952)
total.kmercontent_1952 <- get_total_kmercontent(abs.mx_1952)
mean(total.kmercontent_1952) # note: this is in kb
## [1] 186.5406
legit.kmer.indexes_2342 <- get_legit_kmer_indexes(data_2342)
length(legit.kmer.indexes_2342)
## [1] 134
abs.mx_2342 <- get_abskmer_mx(data_2342, legit.kmer.indexes_2342)
total.kmercontent_2342 <- get_total_kmercontent(abs.mx_2342)
mean(total.kmercontent_2342) # note: this is in kb
## [1] 189.7683
legit.kmer.indexes_2344 <- get_legit_kmer_indexes(data_2344)
length(legit.kmer.indexes_2344)
## [1] 170
abs.mx_2344 <- get_abskmer_mx(data_2344, legit.kmer.indexes_2344)
total.kmercontent_2344 <- get_total_kmercontent(abs.mx_2344)
mean(total.kmercontent_2344)
## [1] 192.691
legit.kmer.indexes_2931 <- get_legit_kmer_indexes(data_2931)
length(legit.kmer.indexes_2931)
## [1] 171
abs.mx_2931 <- get_abskmer_mx(data_2931, legit.kmer.indexes_2931)
total.kmercontent_2931 <- get_total_kmercontent(abs.mx_2931)
mean(total.kmercontent_2931)
## [1] 190.8386
legit.kmer.indexes_2937 <- get_legit_kmer_indexes(data_2937)
length(legit.kmer.indexes_2937)
## [1] 198
abs.mx_2937 <- get_abskmer_mx(data_2937, legit.kmer.indexes_2937)
total.kmercontent_2937 <- get_total_kmercontent(abs.mx_2937)
mean(total.kmercontent_2937)
## [1] 173.7612
# Now, make a boxplot of this.
category <- c(
rep ("CC-1373", times = nrow(data_1373)),
rep ("CC-1952", times = nrow(data_1952)),
rep ("CC-2342", times = nrow(data_2342)),
rep ("CC-2344", times = nrow(data_2344)),
rep ("CC-2931", times = nrow(data_2931)),
rep ("CC-2937", times = nrow(data_2937))
)
total.kmercontent_all <- c(total.kmercontent_1373, total.kmercontent_1952, total.kmercontent_2342, total.kmercontent_2344, total.kmercontent_2931, total.kmercontent_2937)
length(total.kmercontent_all)
## [1] 85
TotalAbs_df <- data.frame (total.kmercontent_all, category)
par(mar=c(5,5,3,1))
boxplot (TotalAbs_df[,1] ~ TotalAbs_df[,2], data=TotalAbs_df, las=2, cex.axis=0.8, ylab="Total tandem repeat content (kb)", Main = "Per MA line absolute total mutation rate", col="deepskyblue3")
i.e. could similarity in mean copy number explain the similarity in correlation patterns.
ANSWER: NO.
shared_corr_kmers <- c("AGGC", "AGGCGG", "ACGC", "ACGCC", "ACGCCC", "ACCGCC", "ACCCGC", "ACACGC", "ACAGGC", "ACAGGG", "ACCAGC", "AGGCGC")
data_1373[,shared_corr_kmers]
## AGGC AGGCGG ACGC ACGCC ACGCCC ACCGCC ACCCGC ACACGC ACAGGC
## CC-1373_MA1 492 659 367 74 120 6 128 374 76
## CC-1373_MA2 584 698 362 97 92 33 154 331 82
## CC-1373_MA3 535 741 499 149 146 39 203 432 94
## CC-1373_MA4 656 772 534 106 149 0 113 375 100
## CC-1373_MA5 455 664 324 55 105 14 120 360 60
## CC-1373_MA6 615 683 334 61 97 23 153 344 73
## CC-1373_MA7 446 714 297 41 75 33 99 296 63
## CC-1373_MA8 440 626 300 79 129 16 107 263 63
## CC-1373_MA10 650 793 538 104 150 15 202 468 102
## CC-1373_MA11 441 904 353 43 80 13 104 370 72
## CC-1373_MA13 459 635 256 46 83 19 79 258 76
## CC-1373_MA14 579 849 534 105 175 31 150 452 90
## CC-1373_MA15 472 634 397 56 109 31 116 352 72
## ACAGGG ACCAGC AGGCGC
## CC-1373_MA1 123 82 449
## CC-1373_MA2 131 94 447
## CC-1373_MA3 161 148 358
## CC-1373_MA4 125 87 222
## CC-1373_MA5 113 92 515
## CC-1373_MA6 144 80 408
## CC-1373_MA7 109 74 476
## CC-1373_MA8 121 59 512
## CC-1373_MA10 191 125 403
## CC-1373_MA11 156 80 463
## CC-1373_MA13 92 68 506
## CC-1373_MA14 169 115 394
## CC-1373_MA15 114 70 438
shared_corr_kmers_1373_means <- colMeans(data_1373[,shared_corr_kmers])
shared_corr_kmers_1373_means
## AGGC AGGCGG ACGC ACGCC ACGCCC ACCGCC ACCCGC
## 524.92308 720.92308 391.92308 78.15385 116.15385 21.00000 132.92308
## ACACGC ACAGGC ACAGGG ACCAGC AGGCGC
## 359.61538 78.69231 134.53846 90.30769 430.07692
shared_corr_kmers_1952_means <- colMeans(data_1952[,shared_corr_kmers])
shared_corr_kmers_2342_means <- colMeans(data_2342[,shared_corr_kmers])
shared_corr_kmers_2344_means <- colMeans(data_2344[,shared_corr_kmers])
shared_corr_kmers_2931_means <- colMeans(data_2931[,shared_corr_kmers])
shared_corr_kmers_2937_means <- colMeans(data_2937[,shared_corr_kmers])
shared_corr_kmers_df <- data.frame(shared_corr_kmers_1373_means, shared_corr_kmers_1952_means, shared_corr_kmers_2342_means, shared_corr_kmers_2344_means, shared_corr_kmers_2931_means, shared_corr_kmers_2937_means)
colnames(shared_corr_kmers_df) <- c("1373", "1952", "2342", "2344", "2931", "2937")
plot(shared_corr_kmers_df["AGGC",])
plot(shared_corr_kmers_df)
shared_corr_kmers_df
shared_corr_kmers_corr <- cor(shared_corr_kmers_df)
shared_corr_kmers_corr
## 1373 1952 2342 2344 2931 2937
## 1373 1.0000000 0.8752619 0.9121997 0.9414233 0.9333322 0.8127379
## 1952 0.8752619 1.0000000 0.7929938 0.8432117 0.8637707 0.7735567
## 2342 0.9121997 0.7929938 1.0000000 0.9807948 0.9546691 0.7339158
## 2344 0.9414233 0.8432117 0.9807948 1.0000000 0.9622563 0.7610236
## 2931 0.9333322 0.8637707 0.9546691 0.9622563 1.0000000 0.8280011
## 2937 0.8127379 0.7735567 0.7339158 0.7610236 0.8280011 1.0000000
# the means of these kmers are not the most correlated.